This module builds on code contained in Coronavirus_Statistics_USAF_v004.Rmd. This file includes the latest code for analyzing data from USA Facts. USA Facts maintains data on cases and deaths by county for coronavirus in the US. Downloaded data are unique by county with date as a column and a separate file for each of cases, deaths, and population.
The intent of this code is to source updated functions that allow for readRunUSAFacts() to be run to obtain, read, process, and analyze data from USA Facts.
The tidyverse library is loaded, and the functions used for CDC daily processing are sourced. Additionally, specific functions for USA Facts are also sourced:
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Functions are available in source file
source("./Generic_Added_Utility_Functions_202105_v001.R")
source("./Coronavirus_CDC_Daily_Functions_v001.R")
source("./Coronavirus_USAF_Functions_v001.R")
Further, the mapping file specific to USA Facts is sourced:
source("./Coronavirus_USAF_Default_Mappings_v001.R")
A few functions should be added back to Generic_Added_…v001.R and Coronavirus_CDC_Daily…_v001.R after they have been more thoroughly checked for compatibility with the state-based clustering. For now, they are included below so as to overwrite the corresponding functions obtained from source(…):
# Updated function for handling county-level clusters
# NOTE(review): this definition is superseded later in the file by an updated
# createSummary() that additionally returns a sparse county map (p5); this
# earlier version returns only p1-p4.
createSummary <- function(df,
                          stateClusterDF=NULL,
                          brewPalette=NA,
                          dataType="state"
                          ) {
  # FUNCTION ARGUMENTS:
  # df: an integrated data frame by cluster-date
  # stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
  # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
  # dataType: the type of maps being produced ("state" or "county")
  # Create plots that can be relevant for a dashboard, including:
  # 1. Map of segments
  # 2. Bar plot of counts by segment
  # 3. Facetted bar plot of segment descriptors (e.g., population, burden per million)
  # 4. Facetted trend-line plot of burden by segments
  # Create a map of the clusters
  p1 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
                         mapLevel=if(dataType=="state") "states" else "counties",
                         keyCol=if(dataType=="state") "state" else "countyFIPS",
                         discreteValues=TRUE,
                         # only create a default fill scale when no palette was requested
                         labelScale=is.na(brewPalette),
                         # small states are hard to see on the map; label them as text
                         textLabel=if(dataType=="state") c("RI", "CT", "DE", "MD", "DC") else c(),
                         extraArgs=if(is.na(brewPalette)) list() else
                           list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                         )
  # Create a bar plot of counts by segment (countOnly=TRUE switches the helper
  # from a map to a bar chart)
  p2 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
                         mapLevel=if(dataType=="state") "states" else "counties",
                         keyCol=if(dataType=="state") "state" else "countyFIPS",
                         discreteValues=TRUE,
                         labelScale=is.na(brewPalette),
                         countOnly=TRUE,
                         extraArgs=if(is.na(brewPalette)) list() else
                           list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                         )
  # Create plot for population and burden by cluster
  # divideBy rescales population to millions and cases to per-thousand so the
  # facets share a readable magnitude
  p3 <- df %>%
    helperAggTotal(aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"),
                   mapper=c("pop"="Population (millions)",
                            "wm_tcpm7"="Cases per thousand",
                            "wm_tdpm7"="Deaths per million"
                            ),
                   xLab=NULL,
                   yLab=NULL,
                   title=NULL,
                   divideBy=c("pop"=1000000, "wm_tcpm7"=1000),
                   extraArgs=if(is.na(brewPalette)) list() else
                     list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                   )
  # Create plot for cumulative burden per million over time
  p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"),
                 arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                 )
  if(!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
  # hospitalization metric is only available for state-level data
  p4 <- df %>%
    helperAggTrend(aggVars=append(c("wm_tcpm7", "wm_tdpm7"), if(dataType=="state") "wm_hpm7" else NULL),
                   mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)",
                            "wm_tdpm7"="Deaths per million\n(cumulative)",
                            "wm_hpm7"="Hospitalized per million\n(current)"
                            ),
                   yLab=NULL,
                   title=NULL,
                   divideBy=c("wm_tcpm7"=1000),
                   linesize=0.75,
                   extraArgs=p4xtra
                   )
  # Return the four plot objects
  list(p1=p1, p2=p2, p3=p3, p4=p4)
}
# Helper function to make a summary map (or, with countOnly=TRUE, a bar chart
# of member counts by cluster)
helperSummaryMap <- function(df,
                             mapLevel="states",
                             keyCol="state",
                             values="cluster",
                             discreteValues=NULL,
                             legend.position="right",
                             labelScale=TRUE,
                             extraArgs=list(),
                             countOnly=FALSE,
                             textLabel=c(),
                             ...
                             ) {
  # FUNCTION ARGUMENTS:
  # df: a data frame containing a level of geography and an associated cluster
  # mapLevel: a parameter for whether the map is "states" or "counties"
  # keyCol: the key column for plotting (usmap::plot_usmap is particular, and this must be 'state' or 'fips')
  # values: the character name of the field containing the data to be plotted
  # discreteValues: boolean for whether the values are discrete (if not, use continuous)
  #                 NULL means infer from data
  # legend.position: character for the location of the legend in the plot
  # labelScale: boolean, should a scale_fill_ be created? Use FALSE if contained in extraArgs
  # extraArgs: list of other arguments that will be appended as '+' to the end of the usmap::plot_usmap call
  # countOnly: should a bar plot of counts only be produced?
  # textLabel: a list of elements that should be labelled as text on the plot (too small to see)
  # ...: other parameters to be passed to usmap::plot_usmap (e.g., labels, include, exclude, etc.)
  # Modify the data frame to contain only the relevant data
  df <- df %>%
    select(all_of(c(keyCol, values))) %>%
    distinct()
  # Determine the type of data being plotted (non-numeric implies discrete)
  if (is.null(discreteValues)) discreteValues <- !is.numeric(df[[values]])
  # Convert data type if needed
  # FIX(review): use scalar && (short-circuiting) rather than vectorized & in
  # this single-condition if(); behavior is unchanged for scalar inputs
  if (isTRUE(discreteValues) && is.numeric(df[[values]]))
    df[[values]] <- factor(df[[values]])
  # If count only is needed, create a count bar chart; otherwise create a map
  if (isTRUE(countOnly)) {
    gg <- df %>%
      ggplot(aes(x=fct_rev(get(values)))) +
      geom_bar(aes_string(fill=values)) +
      # overlay the count at the midpoint of each bar
      stat_count(aes(label=..count.., y=..count../2),
                 geom="text",
                 position="identity",
                 fontface="bold"
                 ) +
      coord_flip() +
      labs(y="Number of members", x="")
  } else {
    # usmap expects the county key to be named 'fips'
    if(keyCol=="countyFIPS") {
      df <- df %>% colRenamer(vecRename=c("countyFIPS"="fips"))
      keyCol <- "fips"
    }
    gg <- usmap::plot_usmap(regions=mapLevel, data=df, values=values, ...)
    # Add text labels (off the east coast) for regions too small to read
    if (length(textLabel) > 0) {
      labDF <- df %>%
        filter(get(keyCol) %in% textLabel) %>%
        mutate(rk=match(get(keyCol), textLabel)) %>%
        arrange(rk) %>%
        # stack the labels diagonally, offset from (-70.1, 40.1)
        mutate(lon=-70.1-seq(0, 0.8*length(textLabel)-0.8, by=0.8),
               lat=40.1-seq(0, 1.5*length(textLabel)-1.5, by=1.5)
               ) %>%
        select(lon, lat, everything()) %>%
        usmap::usmap_transform()
      gg <- gg + geom_text(data=labDF,
                           aes(x=lon.1, y=lat.1, label=paste(get(keyCol), get(values))),
                           size=3.25
                           )
    }
  }
  # Position the legend as requested
  gg <- gg + theme(legend.position=legend.position)
  # Create the scale if appropriate
  if (isTRUE(labelScale)) gg <- gg +
    if(isTRUE(discreteValues)) scale_fill_discrete(values) else scale_fill_continuous(values)
  # Apply extra arguments
  for (ctr in seq_along(extraArgs)) gg <- gg + extraArgs[[ctr]]
  # Return the map object
  gg
}
# Function to pivot the data file longer (or wider)
pivotData <- function(df,
                      pivotKeys,
                      nameVar="name",
                      valVar="value",
                      toLonger=TRUE,
                      ...
                      ) {
  # FUNCTION ARGUMENTS:
  # df: the data frame to reshape
  # pivotKeys: the key columns (everything but cols for pivot_longer,
  #            id_cols for pivot_wider)
  # nameVar: column name for names_to (longer) or names_from (wider)
  # valVar: column name for values_to (longer) or values_from (wider)
  # toLonger: boolean, should pivot_longer() be used rather than pivot_wider()?
  # ...: other arguments passed through to the underlying pivot_*() call
  if (isTRUE(toLonger)) {
    pivot_longer(df, -all_of(pivotKeys), names_to=nameVar, values_to=valVar, ...)
  } else {
    pivot_wider(df,
                all_of(pivotKeys),
                names_from=all_of(nameVar),
                values_from=all_of(valVar),
                ...
                )
  }
}
An existing processed USA Facts list is loaded for use as the comparison set:
# Load the previously processed (2020-10-26) USA Facts list; it serves as the
# comparison baseline for the new data pulls below
cty_newformat_20201026 <- readFromRDS("cty_newformat_20201026")
A function is added to create a county-level cluster map with state borders:
sparseCountyClusterMap <- function(vec,
                                   clustRemap=c("999"=NA),
                                   caption=NULL,
                                   brewPalette=NULL,
                                   naFill="white"
                                   ) {
  # FUNCTION ARGUMENTS:
  # vec: a named vector where the names are countyFIPS and the values are the clusters
  #      can also be a data.frame, in which case clustersToFrame() and clustRemap are not used
  # clustRemap: remapping vector for clusters (default maps the small-population
  #             cluster "999" to NA so those counties are excluded from the fill)
  # caption: caption to be included (NULL means no caption)
  # brewPalette: name of a palette for scale_fill_brewer()
  #              NULL means use scale_fill_discrete() instead
  # naFill: fill to use for NA counties
  # Convert to a tibble for use with usmap if not already a data.frame
  if ("data.frame" %in% class(vec)) {
    df <- vec
  } else {
    df <- clustersToFrame(vec, colNameName="fips", convFactor=FALSE) %>%
      mutate(cluster=as.character(cluster),
             cluster=ifelse(cluster %in% names(clustRemap), clustRemap[cluster], cluster)
             )
  }
  # Create a blank state map with black lines
  blankStates <- usmap::plot_usmap("states")
  # Create a county cluster map with NA values excluded
  dataCounties <- df %>%
    filter(!is.na(cluster)) %>%
    usmap::plot_usmap(regions="counties", data=., values="cluster")
  # Integrate as a ggplot object: counties first, then state outlines on top
  # (the [[1]] pulls the transformed map data out of the plot_usmap objects)
  p1 <- ggplot() +
    geom_polygon(data=dataCounties[[1]],
                 aes(x=x, y=y, group=group, fill=dataCounties[[1]]$cluster),
                 color = NA,
                 size = 0.1
                 ) +
    geom_polygon(data=blankStates[[1]],
                 aes(x=x, y=y, group=group),
                 color = "black",
                 lwd=1.25,
                 # NOTE(review): alpha() is called with a single numeric here;
                 # presumably intended as a (nearly) transparent fill so the
                 # state layer does not hide the counties beneath -- confirm
                 fill = alpha(0.001)
                 ) +
    coord_equal() +
    ggthemes::theme_map()
  # Add the appropriate fill (can use viridis_d also)
  if (is.null(brewPalette)) p1 <- p1 + scale_fill_discrete("Cluster", na.value=naFill)
  else if (brewPalette=="viridis") p1 <- p1 + scale_fill_viridis_d("Cluster", na.value=naFill)
  else p1 <- p1 + scale_fill_brewer("Cluster", palette=brewPalette, na.value=naFill)
  # Add caption if requested
  if (!is.null(caption)) p1 <- p1 + labs(caption=caption)
  # Return the plotting object
  p1
}
The function is then run using previously created segments:
# Map the previously created (2020-10-26) county segments; the default
# clustRemap blanks the small-population ("999") cluster
sparseCountyClusterMap(cty_newformat_20201026$useClusters,
                       brewPalette="Paired",
                       caption="Counties with population under 25k are blank"
                       )
Next steps are to load new data and compare against previous, while using existing segments:
# Input files for the 2021-06-08 download and the baseline data to compare against
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210608.csv",
                 "usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210608.csv"
                 )
compareList <- list("usafCase"=cty_newformat_20201026$dfRaw$usafCase,
                    "usafDeath"=cty_newformat_20201026$dfRaw$usafDeath
                    )
# Read/process the new data and compare against the 2020-10-26 baseline, while
# applying the EXISTING clusters (useClusters is passed, so no new segmentation)
cty_newdata_20210608 <- readRunUSAFacts(maxDate="2021-06-06",
                                        # only download files that are not already on disk
                                        downloadTo=lapply(readList,
                                                          FUN=function(x) if(file.exists(x)) NA else x
                                                          ),
                                        readFrom=readList,
                                        compareFile=compareList,
                                        writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log",
                                        ovrwriteLog=TRUE,
                                        useClusters=cty_newformat_20201026$useClusters,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired"
                                        )
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 225
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
## Checking for similarity of: county
## In reference but not in current: 00001 02270 06000
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 14 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 551 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 225
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
## Checking for similarity of: county
## In reference but not in current: 00001 02270 06000
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 29 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 640 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210608_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 6.20e+9 3.28e+7 1602886
## 2 after 6.18e+9 3.27e+7 1577284
## 3 pctchg 4.58e-3 3.84e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.26e+8 590404 1602886
## 2 after 1.25e+8 589015 1577284
## 3 pctchg 3.75e-3 0.00235 0.0160
## NULL
# Map the clusters as applied to the refreshed (2021-06-08) data
sparseCountyClusterMap(cty_newdata_20210608$useClusters,
                       brewPalette="Paired",
                       caption="Counties with population under 25k are blank"
                       )
There has been significant convergence among segments in average deaths per million and cases per million. This is suggestive of several possibilities, such as that growth in burden may be inversely proportional to previous burden. Next steps are to create segments using the most recent data, seeking to identify differences in 1) cumulative burden, primarily defined by deaths, and 2) shape of the curve in getting to cumulative burden.
New segments are created, with assessments:
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210608.csv",
                 "usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210608.csv"
                 )
# Create new clusters (useClusters=NULL) from the already-processed per-capita
# data, using a 3x3 scheme of total burden x shape of the curve
cty_newsegs_20210608 <- readRunUSAFacts(maxDate="2021-06-06",
                                        writeLog="./RInputFiles/Coronavirus/USAF_NewSegs_20210608_v005.log",
                                        ovrwriteLog=TRUE,
                                        # reuse per-capita data from the previous run to skip re-reading
                                        dfPerCapita=cty_newdata_20210608$dfPerCapita,
                                        useClusters=NULL,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired",
                                        # counties below minPopCluster go to the default cluster
                                        defaultCluster="999",
                                        minPopCluster=40000,
                                        hierarchical=NA,
                                        # month window over which curve shape is evaluated
                                        minShape="2020-04",
                                        maxShape="2021-05",
                                        # relative weightings for the clustering inputs
                                        ratioDeathvsCase = 5,
                                        ratioTotalvsShape = 0.5,
                                        minDeath=100,
                                        minCase=5000,
                                        # 3 high/medium/low x 3 early/spread/late segments
                                        hmlSegs=3,
                                        eslSegs=3,
                                        seed=2106091243
                                        )
##
## *** File has been checked for uniqueness by: countyFIPS date
## NULL
# Map the newly created segments (population threshold was 40k for this run)
sparseCountyClusterMap(cty_newsegs_20210608$useClusters,
                       brewPalette="Paired",
                       caption="Counties with population under 40k are blank"
                       )
The function createSummary() is updated to 1) create the sparse summary map; and 2) exclude the small county segment from select plots:
# Function to create diagnoses and plots for clustering data
diagnoseClusters <- function(lst,
                             lstExtract=fullListExtract,
                             clusterFrame=NULL,
                             brewPalette=NA,
                             clusterType="state",
                             printSummary=TRUE,
                             printDetailed=TRUE
                             ) {
  # FUNCTION ARGUMENTS:
  # lst: a list containing processed clustering data
  # lstExtract: the elements to extract from lst with an optional function for converting the elements
  #             NULL means use the extracted element as-is
  # clusterFrame: tibble of the clusters to be plotted
  #               NULL means create from lst
  # brewPalette: the color palette to use with scale_*_brewer()
  #              default NA means use the standard color/fill profile
  # clusterType: character variable of form "state" for state clusters and "county" for county
  # printSummary: boolean, should summary plots be printed to the log?
  # printDetailed: boolean, should detailed plots be printed to the log?
  # Get the key variable (used for joins and the like)
  if (clusterType=="state") keyVar <- "state"
  else if (clusterType=="county") keyVar <- "countyFIPS"
  else stop(paste0("\nThe passed clusterType: ", clusterType, " is not programmed\n"))
  # Create clusterFrame from lst if it has been passed as NULL
  if (is.null(clusterFrame)) clusterFrame <- clustersToFrame(lst, colNameName=keyVar)
  # Create the integrated and aggregate data from lst
  dfFull <- integrateData(lst, lstExtract=lstExtract, otherDF=list(clusterFrame), keyJoin=keyVar)
  dfAgg <- combineAggData(dfFull, aggBy=plotCombineAggByMapper[[clusterType]])
  # Create the main summary plots
  summaryPlots <- createSummary(dfAgg,
                                stateClusterDF=clusterFrame,
                                brewPalette=brewPalette,
                                dataType=clusterType
                                )
  # Create the detailed summaries
  detPlots <- createDetailedSummaries(dfDetail=dfFull,
                                      dfAgg=dfAgg,
                                      detVar=keyVar,
                                      p2DetMetrics=plotCombineAggByMapper[[clusterType]]$agg2$aggVars,
                                      brewPalette=brewPalette
                                      )
  # Print the summary plots if requested (use the sparse county plotting)
  # NOTE(review): this arranges summaryPlots$p5, which only the county-enabled
  # createSummary() produces (and it is NULL for state-level data) -- confirm
  # behavior before calling with clusterType="state"
  if (isTRUE(printSummary)) {
    gridExtra::grid.arrange(summaryPlots$p5 + theme(legend.position="none"),
                            summaryPlots$p3 + theme(legend.position="left"),
                            summaryPlots$p4,
                            layout_matrix=rbind(c(1, 2),
                                                c(3, 3)
                                                )
                            )
  }
  # Print the detailed plots if requested
  if (isTRUE(printDetailed)) purrr::walk(detPlots, .f=print)
  # Return a list of the key plotting files
  list(dfFull=dfFull,
       dfAgg=dfAgg,
       plotClusters=clusterFrame,
       summaryPlots=summaryPlots,
       detPlots=detPlots
       )
}
# Function to create detailed cluster summaries
createDetailedSummaries <- function(dfDetail,
                                    dfAgg,
                                    aggVar=c("cluster"),
                                    detVar=c("state"),
                                    p1Metrics=c("tcpm", "tdpm"),
                                    p1Order=c("tdpm"),
                                    p2DetMetrics=c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7"),
                                    p2AggMetrics=paste0("wm_", p2DetMetrics),
                                    p3Metrics=p1Metrics,
                                    p3Days=30,
                                    p3Slope=0.25,
                                    mapper=c("tcpm"="Cases per million\n(cumulative)",
                                             "tdpm"="Deaths per million\n(cumulative)",
                                             "cpm7"="Cases\nper million",
                                             "dpm7"="Deaths\nper million",
                                             "hpm7"="Hospitalized\nper million",
                                             "tdpm7"="Deaths (cum)\nper million",
                                             "tcpm7"="Cases (cum)\nper million"
                                             ),
                                    brewPalette=NA
                                    ) {
  # FUNCTION ARGUMENTS:
  # dfDetail: data frame or tibble containing detailed (sub-cluster) data
  # dfAgg: data frame or tibble containing aggregated (cluster) data
  # aggVar: variable reflecting the aggregate level
  # detVar: variable reflecting the detailed level
  # p1Metrics: metrics to be shown for plot 1 (will be faceted)
  # p1Order: variable for ordering detVar in p1
  # p2DetMetrics: variables to be included from dfDetail for p2
  # p2AggMetrics: corresponding variables to be included from dfAgg for p2
  # p3Metrics: metrics to be included in the growth plots
  # p3Days: number of days to include for calculating growth
  # p3Slope: the slope for the dashed line for growth in p3
  # mapper: mapping file for variable name to descriptive name
  # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
  # Create plot for aggregates by sub-cluster (state-level only: a per-state
  # bar chart of the most recent cumulative burden, colored by cluster)
  if(detVar=="state") {
    p1 <- dfDetail %>%
      colSelector(vecSelect=c(detVar, aggVar, "date", p1Metrics)) %>%
      pivot_longer(all_of(p1Metrics)) %>%
      filter(!is.na(value)) %>%
      group_by_at(detVar) %>%
      filter(date==max(date)) %>%
      ungroup() %>%
      # NOTE(review): the .fun below references `name`, which is expected to
      # resolve to the data column inside the aes() data mask -- fragile;
      # confirm it orders by the p1Order metric as intended
      ggplot(aes(x=fct_reorder(get(detVar),
                               value,
                               .fun=function(x) { max(ifelse(name==p1Order, x, 0)) }
                               ),
                 y=value
                 )
             ) +
      geom_col(aes(fill=get(aggVar))) +
      facet_wrap(~mapper[name], scales="free_x") +
      coord_flip() +
      labs(title="Cumulative burden", x=NULL, y=NULL)
    if (!is.na(brewPalette)) p1 <- p1 + scale_fill_brewer("Cluster", palette=brewPalette)
  } else {
    # Do not create the plot for other than state-level data
    p1 <- NULL
  }
  # Create facetted burden trends by aggregate
  # For state-level, create each state as a line
  # For anything else, create a 10%-90% range
  if (detVar=="state") {
    p2 <- dfDetail %>%
      colSelector(vecSelect=c(detVar, "date", aggVar, p2DetMetrics)) %>%
      pivot_longer(all_of(p2DetMetrics)) %>%
      filter(!is.na(value)) %>%
      ggplot(aes(x=date, y=value)) +
      geom_line(aes_string(group=detVar), color="grey", size=0.5) +
      facet_grid(mapper[name] ~ get(aggVar), scales="free_y") +
      scale_x_date(date_breaks="2 months", date_labels="%b-%y") +
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
      labs(x=NULL, y=NULL, title="Burden by cluster and metric")
  } else {
    p2 <- dfDetail %>%
      colSelector(vecSelect=c(detVar, "date", aggVar, p2DetMetrics)) %>%
      pivot_longer(all_of(p2DetMetrics)) %>%
      filter(!is.na(value)) %>%
      group_by_at(c(aggVar, "name", "date")) %>%
      summarize(p10=unname(quantile(value, 0.1)), p90=unname(quantile(value, 0.9)), .groups="drop") %>%
      ggplot(aes(x=date)) +
      geom_ribbon(aes(ymin=p10, ymax=p90), alpha=0.75) +
      facet_grid(mapper[name] ~ get(aggVar), scales="free_y") +
      scale_x_date(date_breaks="2 months", date_labels="%b-%y") +
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
      labs(x=NULL,
           y=NULL,
           title="Burden by cluster and metric",
           subtitle="Shaded region is 10%le through 90%le, solid line is weighted mean"
           )
  }
  # Overlay the aggregate (cluster-level) trend lines on p2; the aggregate
  # metric columns are renamed to match the detail names so the facets align
  aggPlot <- dfAgg %>%
    colSelector(vecSelect=c("date", aggVar, p2AggMetrics)) %>%
    colRenamer(vecRename=purrr::set_names(p2DetMetrics, p2AggMetrics)) %>%
    pivot_longer(all_of(p2DetMetrics)) %>%
    filter(!is.na(value))
  p2 <- p2 +
    geom_line(data=aggPlot, aes_string(color=aggVar, group=aggVar, y="value"), size=1.5)
  if (!is.na(brewPalette)) p2 <- p2 +
    scale_color_brewer("Cluster", palette=brewPalette) +
    theme(legend.position="none")
  # Create growth trends plot (current cumulative vs growth over p3Days)
  # NOTE(review): the if (TRUE) appears to be a manual toggle left in place;
  # the else branch (p3 <- NULL) is currently unreachable
  if (TRUE) {
    # Exclude the small-population ("999") cluster for county-level data
    if(detVar=="countyFIPS") p3 <- dfDetail %>% filter(cluster != "999") else p3 <- dfDetail
    p3 <- p3 %>%
      colSelector(vecSelect=c(detVar, aggVar, "date", p3Metrics)) %>%
      pivot_longer(all_of(p3Metrics)) %>%
      filter(!is.na(value)) %>%
      group_by_at(c(detVar, "name")) %>%
      # keep only the most recent date and the date p3Days earlier
      filter(date %in% c(max(date), max(date)-lubridate::days(p3Days))) %>%
      mutate(growth=max(value)-min(value)) %>% # not ideal way to calculate
      filter(date==max(date)) %>%
      ungroup() %>%
      ggplot(aes(x=value, y=growth))
    # States get labelled text points; counties get plain points
    if(detVar=="state") p3 <- p3 + geom_text(aes_string(color=aggVar, label=detVar), fontface="bold")
    else p3 <- p3 + geom_point(aes_string(color=aggVar))
    p3 <- p3 +
      facet_wrap(~mapper[name], scales="free") +
      labs(title=paste0("Current vs growth"),
           subtitle=paste0("Dashed line represents ",
                           round(100*p3Slope),
                           "% growth rate over past ",
                           p3Days,
                           " days"
                           ),
           x="Most recent cumulative",
           y=paste0("Growth over past ", p3Days, " days")
           ) +
      lims(y=c(0, NA), x=c(0, NA)) +
      theme(panel.background = element_rect(fill = "white", colour = "white"),
            panel.grid.major = element_line(size = 0.25, linetype = 'solid', color = "grey")
            ) +
      geom_abline(slope=p3Slope, intercept=0, lty=2)
    if (!is.na(brewPalette)) {
      p3 <- p3 + scale_color_brewer(stringr::str_to_title(aggVar), palette=brewPalette)
    }
    if(detVar=="countyFIPS") p3 <- p3 + labs(caption="Counties below population threshold excluded")
  } else {
    p3 <- NULL
  }
  # Return a list of plot objects
  list(p1=p1, p2=p2, p3=p3)
}
# Updated function for handling county-level clusters
# This version supersedes the earlier createSummary() in this file: it adds a
# sparse county map (p5) that blanks the small-population ("999") cluster
createSummary <- function(df,
                          stateClusterDF=NULL,
                          brewPalette=NA,
                          dataType="state"
                          ) {
  # FUNCTION ARGUMENTS:
  # df: an integrated data frame by cluster-date
  # stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
  # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
  # dataType: the type of maps being produced ("state" or "county")
  # Create plots that can be relevant for a dashboard, including:
  # 1. Map of segments
  # 2. Bar plot of counts by segment
  # 3. Facetted bar plot of segment descriptors (e.g., population, burden per million)
  # 4. Facetted trend-line plot of burden by segments
  # 5. Sparse county cluster map (if dataType is "county")
  # Create a map of the clusters
  p1 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
                         mapLevel=if(dataType=="state") "states" else "counties",
                         keyCol=if(dataType=="state") "state" else "countyFIPS",
                         discreteValues=TRUE,
                         labelScale=is.na(brewPalette),
                         # small states are hard to see on the map; label them as text
                         textLabel=if(dataType=="state") c("RI", "CT", "DE", "MD", "DC") else c(),
                         extraArgs=if(is.na(brewPalette)) list() else
                           list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                         )
  # Create a bar plot of counts by segment
  p2 <- helperSummaryMap(if(is.null(stateClusterDF)) df else stateClusterDF,
                         mapLevel=if(dataType=="state") "states" else "counties",
                         keyCol=if(dataType=="state") "state" else "countyFIPS",
                         discreteValues=TRUE,
                         labelScale=is.na(brewPalette),
                         countOnly=TRUE,
                         extraArgs=if(is.na(brewPalette)) list() else
                           list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                         )
  # Create plot for population and burden by cluster
  p3 <- df %>%
    helperAggTotal(aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"),
                   mapper=c("pop"="Population (millions)",
                            "wm_tcpm7"="Cases per thousand",
                            "wm_tdpm7"="Deaths per million"
                            ),
                   xLab=NULL,
                   yLab=NULL,
                   title=NULL,
                   divideBy=c("pop"=1000000, "wm_tcpm7"=1000),
                   extraArgs=if(is.na(brewPalette)) list() else
                     list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
                   )
  # Create plot for cumulative burden per million over time
  p4xtra <- list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"),
                 arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                 )
  if(!is.na(brewPalette)) p4xtra$arg3 <- scale_color_brewer("Cluster", palette=brewPalette)
  # hospitalization metric is only available for state-level data
  p4 <- df %>%
    helperAggTrend(aggVars=append(c("wm_tcpm7", "wm_tdpm7"), if(dataType=="state") "wm_hpm7" else NULL),
                   mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)",
                            "wm_tdpm7"="Deaths per million\n(cumulative)",
                            "wm_hpm7"="Hospitalized per million\n(current)"
                            ),
                   yLab=NULL,
                   title=NULL,
                   divideBy=c("wm_tcpm7"=1000),
                   linesize=0.75,
                   extraArgs=p4xtra
                   )
  # Create the sparse county plot (if it is county data) - assumes default that '999' is 'too small'
  if (dataType=="county") {
    # Build a named vector (countyFIPS -> cluster) for sparseCountyClusterMap()
    vecFrame <- if(is.null(stateClusterDF)) df else stateClusterDF
    vecFrame <- vecFrame %>% select(countyFIPS, cluster) %>%
      distinct()
    vecUse <- vecFrame$cluster %>% purrr::set_names(vecFrame$countyFIPS)
    p5 <- sparseCountyClusterMap(vecUse,
                                 caption="Counties below population threshold left blank",
                                 brewPalette=if(is.na(brewPalette)) NULL else brewPalette
                                 )
  } else {
    # No sparse map for state-level data
    p5 <- NULL
  }
  list(p1=p1, p2=p2, p3=p3, p4=p4, p5=p5)
}
The updated functions are tested:
# Confirm that new cluster maps are working as intended: re-run using the
# per-capita data and clusters just created (no download, read, or comparison)
cty_chksegs_20210608 <- readRunUSAFacts(maxDate="2021-06-06",
                                        dfPerCapita=cty_newsegs_20210608$dfPerCapita,
                                        useClusters=cty_newsegs_20210608$useClusters,
                                        skipAssessmentPlots=FALSE,
                                        brewPalette="Paired"
                                        )
## NULL
The updated maps blank out the ‘999’ (small population) cluster as desired. The clusters are designed using a 3x3 of total burden (mainly death) and shape of the curve. Shapes of the curve are plotted, normalized so that each curve sums to 100%:
# Normalize each cluster's rolling-7 case/death curve so it sums to 100%
# (small-population cluster '999' excluded)
p1Data <- cty_chksegs_20210608$plotDataList$dfAgg %>%
  pivotData(pivotKeys=c("cluster", "date", "pop")) %>%
  filter(!is.na(value), cluster!="999", name %in% c("wm_cpm7", "wm_dpm7")) %>%
  group_by(cluster, name) %>%
  mutate(pct=value/sum(value)) %>%
  ungroup()
# Plot all together
# FIX(review): labs() has no 'main' argument ('main' is base graphics), so the
# plot titles were silently dropped; use 'title=' instead
p1Data %>%
  ggplot(aes(x=date, y=pct)) +
  geom_line(aes(group=cluster, color=cluster), lwd=1) +
  facet_wrap(~c("wm_cpm7"="Rolling 7 cases", "wm_dpm7"="Rolling 7 deaths")[name]) +
  scale_color_brewer(palette="Paired") +
  labs(x=NULL,
       y="Percentage of total burden",
       title="Burden over time, normalized to 100% cumulative burden"
       )
# Plot as facets (one column per cluster)
p1Data %>%
  ggplot(aes(x=date, y=pct)) +
  geom_line(aes(group=cluster, color=cluster), lwd=1) +
  facet_grid(c("wm_cpm7"="Rolling 7 cases", "wm_dpm7"="Rolling 7 deaths")[name]~cluster) +
  scale_color_brewer(palette="Paired") +
  labs(x=NULL,
       y="Percentage of total burden",
       title="Burden over time, normalized to 100% cumulative burden"
       )
Broadly, the shapes selected by the clustering include:
Broadly, there are examples of relatively higher and lower burden within each of these shapes, though the highest burden clusters mainly had two spikes with primary impact early and the medium burden clusters rarely had the primary impact early.
Next, all counties of at least 50,000 population are examined for the shape of the cumulative deaths curve:
# For counties with at least 50k population, normalize cumulative curves to
# their final (max) value so curve SHAPE can be compared across counties
p2Data <- cty_chksegs_20210608$plotDataList$dfFull %>%
  select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
  pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>%
  filter(!is.na(value), pop>=50000) %>%
  group_by(countyFIPS, name) %>%
  mutate(pct=value/max(value)) %>%
  ungroup()
# Distance matrix over the cumulative-death shape curves (counties as rows,
# dates as columns)
p2Dist <- p2Data %>%
  filter(name=="tdpm7") %>%
  select(countyFIPS, date, pct) %>%
  pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="pct") %>%
  column_to_rownames("countyFIPS") %>%
  as.matrix() %>%
  dist()
# Hierarchical clustering under complete and single linkage, with dendrograms
p2Complete <- hclust(p2Dist, method="complete")
plot(p2Complete)
p2Single <- hclust(p2Dist, method="single")
plot(p2Single)
There is something peculiar about data in ‘32510’. A plot is created:
# Highlight county 32510 (red) against the all-county mean (black) to inspect
# the anomaly seen in the dendrograms
p2Data %>%
  ggplot(aes(x=date, y=pct)) +
  geom_line(data=~filter(., countyFIPS=="32510"), aes(group=countyFIPS), color="red", lwd=2) +
  geom_line(data=~summarize(group_by(., name, date), pct=mean(pct), .groups="drop")) +
  facet_wrap(~c("tcpm7"="Rolling 7 cases", "tdpm7"="Rolling 7 deaths")[name]) +
  labs(x=NULL, y="Percentage of total burden on day", title="Cluster 32510 is red, mean is black")
There is clearly a data issue with county 32510. Clusters are examined at a variable number of cut points:
# Cut a hierarchical clustering at k groups, summarize the burden curves by
# cluster, print two diagnostic plots, and return the summarized frame
plotHierarchical <- function(obj, k, df) {
  # FUNCTION ARGUMENTS:
  # obj: clustering object from hierarchical clustering
  # k: number of clusters to create
  # df: data frame with burden data
  # Counts by cluster (printed as a side effect)
  ctCluster <- cutree(obj, k=k) %>%
    table() %>%
    print()
  # Make clustering data frame: join cluster assignments onto the burden data,
  # derive day-over-day deltas, and summarize mean/5%/95% by cluster-date-metric
  clData <- cutree(obj, k=k) %>%
    clustersToFrame(colNameName="countyFIPS") %>%
    joinFrames(df) %>%
    group_by(countyFIPS, name) %>%
    # first row of each county-metric has no lag; use the level itself
    mutate(delta=ifelse(row_number()==1, pct, pct-lag(pct))) %>%
    ungroup() %>%
    group_by(cluster, date, name) %>%
    summarize(p05cum=quantile(pct, 0.05),
              mucum=mean(pct),
              p95cum=quantile(pct, 0.95),
              p05delta=quantile(delta, 0.05),
              mudelta=mean(delta),
              p95delta=quantile(delta, 0.95),
              .groups="drop"
              )
  # Plot by cluster and metric (cumulative)
  p1 <- clData %>%
    ggplot(aes(x=date)) +
    geom_line(aes(color=cluster, y=mucum), lwd=1) +
    geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
    facet_grid(c("tcpm7"="% cumulative cases", "tdpm7"="% cumulative deaths")[name]~cluster) +
    scale_color_brewer(palette="Set1") +
    labs(x=NULL, y="% of cumulative", title="Mean and 90% range by cluster and metric")
  print(p1)
  # Plot incremental rather than cumulative
  p2 <- clData %>%
    ggplot(aes(x=date)) +
    geom_line(aes(color=cluster, y=mudelta), lwd=1) +
    geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
    facet_grid(c("tcpm7"="% total cases", "tdpm7"="% total deaths")[name]~cluster) +
    scale_color_brewer(palette="Set1") +
    labs(x=NULL, y="% of total", title="Mean and 90% range by cluster and metric")
  print(p2)
  # Return the clustering frame
  clData
}
# Examine cuts of k=2 through k=5 clusters (plotList[[i]] holds the k=i+1 frame)
plotList <- lapply(2:5, FUN=function(x) plotHierarchical(p2Complete, k=x, df=p2Data))
## .
## 1 2
## 841 149
## Joining, by = "countyFIPS"
## .
## 1 2 3
## 773 68 149
## Joining, by = "countyFIPS"
## .
## 1 2 3 4
## 498 275 68 149
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5
## 498 275 68 114 35
## Joining, by = "countyFIPS"
At a glance, with 5 clusters, there is a clear early cluster, a clear early/late cluster, and a group of three seemingly similar clusters (primarily late). Further exploration of these three clusters may be merited.
The five clusters are examined on a single plot:
# Overlay all clusters on a single plot, faceted by metric and by whether the
# cluster is in the "late" group
tempPlotter <- function(df, vecLate, facetScales="free_y", showRibbon=TRUE, showCum=FALSE) {
  # FUNCTION ARGUMENTS:
  # df: summarized cluster frame as returned by plotHierarchical()
  # vecLate: clusters to be placed in the "late" facet row
  # facetScales: scales argument passed through to facet_grid()
  # showRibbon: boolean, add the 5%-95% shaded ribbon?
  # showCum: boolean, plot cumulative (rather than incremental) summaries?
  metricLabels <- c("tcpm7"="% total cases", "tdpm7"="% total deaths")
  gg <- df %>%
    mutate(lateCluster=(cluster %in% vecLate)) %>%
    ggplot(aes(x=date)) +
    geom_line(aes(group=cluster, color=cluster, y=if(showCum) mucum else mudelta), lwd=1) +
    facet_grid(lateCluster~metricLabels[name], scales=facetScales) +
    scale_color_brewer(palette="Set1") +
    labs(x=NULL,
         y=NULL,
         title="Disease burden by time period and shape cluster"
         )
  # Optionally layer on the 90% range ribbon
  if (isTRUE(showRibbon)) {
    gg <- gg +
      geom_ribbon(aes(ymin=if(showCum) p05cum else p05delta,
                      ymax=if(showCum) p95cum else p95delta,
                      fill=cluster,
                      group=cluster
                      ),
                  alpha=0.25
                  ) +
      scale_fill_brewer(palette="Set1") +
      labs(subtitle="Shaded region covers 90% of observations")
  }
  gg
}
# Overlay the k=5 solution (plotList[[4]] holds k=5), flagging clusters 4 and 5
# as the "late" facet row; first the incremental view, then cumulative
tempPlotter(plotList[[4]], vecLate=c(4, 5), showRibbon=FALSE)
tempPlotter(plotList[[4]], vecLate=c(4, 5), showRibbon=FALSE, showCum=TRUE)
# Repeat both views with the 90% ribbons drawn
tempPlotter(plotList[[4]], vecLate=c(4, 5))
tempPlotter(plotList[[4]], vecLate=c(4, 5), showCum=TRUE)
Visually, there are meaningful distinctions in the clusters when overlaid, particularly with the shape of the cumulative curve:
The geographic locations of the five hierarchical cluster types are plotted:
# Map the geographic membership of a k-cluster cut of a hierarchical clustering.
#
# obj: hclust object to cut
# k: number of clusters
# regions: geographic level; drives the default key column name
# varName: key column for the cluster frame ("fips" for counties)
# values: name of the column holding the cluster assignment
# lvlOrder: optional explicit ordering of cluster levels (controls color order)
# title: defaults to an ordering hint only when lvlOrder is supplied
# caption: optional plot caption
tempMapper <- function(obj,
                       k,
                       regions="counties",
                       varName=if(regions=="counties") "fips" else "state",
                       values="cluster",
                       lvlOrder=NULL,
                       title=if(is.null(lvlOrder)) NULL else "Lightest colors have earliest burden",
                       caption=NULL
                       ) {
  clusterFrame <- clustersToFrame(cutree(obj, k=k), colNameName=varName)
  # Re-level the cluster factor when an explicit ordering is requested
  if (!is.null(lvlOrder)) {
    clusterFrame[[values]] <- factor(clusterFrame[[values]], levels=lvlOrder)
  }
  sparseCountyClusterMap(clusterFrame, brewPalette="viridis", caption=caption) +
    labs(title=title)
}
# Shared caption noting the population filter applied upstream
tmpText <- "Plots only counties with at least 50k population"
# Map cluster membership for k=2 through 5; lvlOrder sequences levels so that
# lighter colors correspond to earlier disease burden
tempMapper(p2Complete, k=2, lvlOrder=c(1, 2), caption=tmpText)
tempMapper(p2Complete, k=3, lvlOrder=c(2, 1, 3), caption=tmpText)
tempMapper(p2Complete, k=4, lvlOrder=c(3, 2, 1, 4), caption=tmpText)
tempMapper(p2Complete, k=5, lvlOrder=c(3, 2, 1, 4, 5), caption=tmpText)
Maps are created to show when counties first cleared specific hurdles for cumulative deaths per million:
# Map the period (default: calendar quarter) in which each county first
# cleared a target cumulative burden per million.
#
# df: county-date data containing pop, date, countyFIPS, and the tgtVar column
# popMin/popMax: population filter applied as [popMin, popMax)
# tgtVar: burden metric column name (string)
# tgtBPM: threshold the metric must reach
# fn: function mapping the first-clearance date to a plotted bucket
# ...: forwarded to fn (e.g. with_year=TRUE for lubridate::quarter)
tempBurdenMapper <- function(df,
popMin=1,
popMax=Inf,
tgtVar="tdpm",
tgtBPM=1000,
fn=lubridate::quarter,
title=paste0("Time when county first hit ", tgtBPM, " on metric ", tgtVar),
caption=paste("Plots counties with population between", popMin, "and", popMax),
...
) {
p1 <- df %>%
filter(pop < popMax, pop >= popMin) %>%
group_by(countyFIPS) %>%
# keyDate: earliest date at/above the threshold. ifelse() strips the Date
# class, hence the as.Date(..., origin=) round trip. specNA(min) is
# presumably an NA-tolerant wrapper around min() -- TODO confirm in source.
summarize(bpmMax=max(get(tgtVar), na.rm=TRUE),
keyDate=as.Date(specNA(min)(ifelse(get(tgtVar) >= tgtBPM, date, NA)), origin="1970-01-01"),
.groups="drop"
) %>%
# Counties that never cleared the threshold fall into the "not yet" bucket
mutate(cluster=fn(keyDate, ...),
cluster=ifelse(is.na(cluster), "not yet", cluster),
fips=countyFIPS
) %>%
sparseCountyClusterMap(brewPalette = "viridis", caption=caption)
p1 + labs(title=title)
}
# Looking at tdpm of 500, 1000, 5000
# (with_year=TRUE is forwarded to lubridate::quarter so buckets are year.quarter)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=500, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=1000, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=5000, with_year=TRUE)
# Looking specifically at 2500 tdpm
# (all counties, then only 25k+ population, then only under-25k population)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, tgtBPM=2500, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, popMin=25000, tgtBPM=2500, with_year=TRUE)
tempBurdenMapper(cty_chksegs_20210608$plotDataList$dfFull, popMax=25000, tgtBPM=2500, with_year=TRUE)
Next steps are to create curves using larger counties, then assess how well they fit the shape of the curves in the smaller counties.
A function is created to calculate the distances from each of the cluster means:
# Squared-Euclidean distance from each county's curve to each cluster mean.
#
# dfClust: cluster-level summary (cluster, date, and the metClust column)
# dfAll: county-level data (countyFIPS, date, pct)
# metType: metric (value of the name column) on which curves are compared
# metClust: summary column in dfClust holding the cluster mean
# Returns one row per cluster-county pair with the summed squared distance,
# sorted so each county's closest cluster appears first.
tempDistCluster <- function(dfClust, dfAll, metType="tdpm7", metClust="mucum") {
  clusterMeans <- dfClust %>%
    filter(name==metType) %>%
    colSelector(vecSelect=c("cluster", "date", metClust)) %>%
    colRenamer(vecRename=c("pct") %>% purrr::set_names(metClust))
  countyCurves <- dfAll %>%
    filter(name==metType) %>%
    colSelector(vecSelect=c("countyFIPS", "date", "pct"))
  # Cross every cluster mean with every county via the shared date axis;
  # missing values on either side count as zero before squaring
  clusterMeans %>%
    left_join(countyCurves, by="date") %>%
    mutate(delta2=(ifelse(is.na(pct.x), 0, pct.x)-ifelse(is.na(pct.y), 0, pct.y))^2) %>%
    group_by(cluster, countyFIPS) %>%
    summarize(dist=sum(delta2), .groups="drop") %>%
    arrange(countyFIPS, dist)
}
# Create data for all counties
# (long format by metric; pct = share of the county's final cumulative value)
p2All <- cty_chksegs_20210608$plotDataList$dfFull %>%
select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>%
filter(!is.na(value), pop>0) %>%
group_by(countyFIPS, name) %>%
mutate(pct=value/max(value)) %>%
ungroup()
# Calculate distances to cluster mean, then print
countyClusterDist <- tempDistCluster(dfClust=plotList[[4]], dfAll=p2All)
countyClusterDist
## # A tibble: 15,710 x 3
## cluster countyFIPS dist
## <ord> <chr> <dbl>
## 1 1 01001 1.20
## 2 2 01001 3.10
## 3 3 01001 5.48
## 4 4 01001 14.8
## 5 5 01001 48.8
## 6 2 01003 0.938
## 7 1 01003 1.88
## 8 3 01003 7.53
## 9 4 01003 19.3
## 10 5 01003 58.0
## # ... with 15,700 more rows
# Assess overlap of cluster for counties already clustered
tmpDistDF <- cutree(p2Complete, k=5) %>%
vecToTibble(colNameName="countyFIPS", colNameData="hierCluster") %>%
left_join(countyClusterDist, by="countyFIPS") %>%
arrange(countyFIPS, dist)
# Keep each county's closest cluster mean (smallest dist after the sort above),
# then cross-tabulate against the assigned hierarchical cluster as a tile grid
tmpDistDF %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
mutate(cluster=as.integer(as.character(cluster))) %>%
count(hierCluster, cluster) %>%
group_by(hierCluster) %>%
mutate(pct=n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=cluster, y=hierCluster)) +
geom_tile(aes(fill=pct)) +
geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) +
scale_fill_continuous(low="white", high="lightgreen") +
labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean",
x="Closest hierarchical cluster mean (k=5)",
y="Assigned hierarchical cluster (k=5)"
)
Most counties are closest to the mean of the hierarchical cluster they are assigned to. There are some differences, as expected since method="complete" was chosen for the hierarchical clusters. Assignments are then made for all counties, and then plotted:
# Full clustering database
# Each county is assigned to its closest cluster mean; hierCluster records the
# original hierarchical assignment (NA for counties absent from that data)
distOnlyClust <- countyClusterDist %>%
arrange(countyFIPS, dist) %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
left_join(cutree(p2Complete, k=5) %>%
vecToTibble(colNameName="countyFIPS", colNameData="hierCluster"),
by="countyFIPS"
)
# Cluster assignments depending on size of county
distOnlyClust %>%
ggplot(aes(x=cluster)) +
geom_bar(fill="lightblue") +
geom_text(stat="count", aes(y=..count../2, label=..count..)) +
facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"),
scales="free_y"
) +
labs(title="Closest hierarchical cluster")
# Plot all counties based on closest cluster
# (levels sequenced so lighter viridis colors indicate earlier burden)
sparseCountyClusterMap(rename(distOnlyClust, fips=countyFIPS) %>%
mutate(cluster=factor(cluster, levels=c(3, 2, 1, 4, 5))),
brewPalette="viridis",
caption="Closest hierarchical cluster mean (euclidean distance of % cumulative death)"
) +
labs(title="Lighter clusters have higher percentage of burden early")
There are clear geographic patterns to the data, with counties having earlier disease generally being surrounded by other counties with (relatively) earlier disease.
The latest burden is also included:
# Attach the latest cumulative deaths per million (tdpm7) to the cluster
# assignments, then plot its density by cluster and county size
p2Merge <- p2All %>%
filter(date==max(date), name=="tdpm7") %>%
select(countyFIPS, pop, state, date, tdpm7=value) %>%
inner_join(distOnlyClust, by=c("countyFIPS"))
p2Merge %>%
mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
ggplot(aes(x=tdpm7)) +
geom_density(aes(color=cluster), size=1) +
facet_wrap(~countySize) +
scale_color_brewer(palette="Set1") +
labs(x="Cumulative deaths per million", y="Relative density", title="Deaths vs shape segments")
Smaller counties are more likely to be very low or very high on deaths per million, as expected given the smaller denominators. Among the larger counties, segment 5 appears to generally have higher deaths, a trend that does not appear to hold in smaller counties.
ECDF curves are created as well:
# Manual ECDF: within-group rank divided by group size, faceted by cluster
p2Merge %>%
mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
arrange(cluster, countySize, tdpm7) %>%
group_by(cluster, countySize) %>%
mutate(prop=row_number()/n()) %>%
ungroup() %>%
ggplot(aes(x=tdpm7, y=prop)) +
geom_line(aes(group=countySize, color=countySize), size=1) +
facet_wrap(~cluster) +
scale_color_brewer(palette="Set1") +
labs(x="Cumulative deaths per million",
y="ECDF",
title="Deaths vs shape segments",
subtitle="Includes all counties (even with 0 deaths)"
)
# Same ECDF restricted to counties with 1+ deaths, faceted by county size
p2Merge %>%
mutate(countySize=ifelse(pop>=50000, ">50k", "<50k")) %>%
filter(tdpm7>0) %>%
arrange(cluster, countySize, tdpm7) %>%
group_by(cluster, countySize) %>%
mutate(prop=row_number()/n()) %>%
ungroup() %>%
ggplot(aes(x=tdpm7, y=prop)) +
geom_line(aes(group=cluster, color=cluster), size=1) +
facet_wrap(~countySize) +
scale_color_brewer(palette="Set1") +
labs(x="Cumulative deaths per million",
y="ECDF",
title="Deaths vs shape segments",
subtitle="Includes only counties with 1+ deaths"
)
Larger counties appear to generally have had lower burdens, with the exception of very low burden, where there is some crossing of the curves at around 500 deaths per million.
Suppose that counties are clustered based on cumulative deaths (not percentage) by month, with focus on larger counties first:
# Include all counties with population greater than 0
p2DataAll <- cty_chksegs_20210608$plotDataList$dfFull %>%
select(countyFIPS, pop, state, date, tdpm7, tcpm7) %>%
pivotData(pivotKeys=c("countyFIPS", "state", "date", "pop")) %>%
filter(!is.na(value), pop>=1) %>%
group_by(countyFIPS, name) %>%
mutate(pct=value/max(value)) %>%
ungroup()
# Create distance matrix and hierarchical clusters
# Uses raw tdpm7 values (not pct) for counties with 100k+ population,
# reshaped wide: one row per county, one column per date
p2DistValue100k <- p2DataAll %>%
filter(name=="tdpm7", pop>=100000) %>%
select(countyFIPS, date, value) %>%
pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="value") %>%
column_to_rownames("countyFIPS") %>%
as.matrix() %>%
dist()
# Run clustering with method "complete"
p2CompleteValue100k <- hclust(p2DistValue100k, method="complete")
plot(p2CompleteValue100k)
There appears to be a branch of counties that are unique:
# Inspect the smaller of the two k=2 clusters: plot each member county's
# deaths-per-million curve individually for review.
cutree(p2CompleteValue100k, k=2) %>%
  vecToTibble(colNameName="countyFIPS", colNameData="cluster") %>%
  # Keep only the cluster with the fewest member counties
  group_by(cluster) %>%
  mutate(n=n()) %>%
  ungroup() %>%
  filter(n==min(n)) %>%
  left_join(p2DataAll, by="countyFIPS") %>%
  # Explicit join keys (previously inferred, emitting a "Joining, by" message)
  left_join(select(getCountyData(), countyFIPS, countyName, state),
            by=c("countyFIPS", "state")) %>%
  filter(name=="tdpm7") %>%
  # Facet label of the form "FIPS - County (ST)"
  mutate(countyName=paste0(countyFIPS, " - ",
                           stringr::str_replace(countyName, " County", ""),
                           " (",
                           state,
                           ")"
                           )
         ) %>%
  ggplot(aes(x=date, y=value)) +
  geom_line() +
  facet_wrap(~countyName) +
  labs(x=NULL, y="Deaths per million", title="Counties in the first, small segment")
## Joining, by = c("countyFIPS", "state")
These counties have high total burdens combined with shapes differing from the norm. This leads them to become their own clusters, requiring a greater number of clusters to get at the main groupings:
# Updated to allow for variable other than pct
plotHierarchical <- function(obj, k, df, keyVar="pct", facetScale="fixed") {
# FUNCTION ARGUMENTS:
# obj: clustering object from hierarchical clustering
# k: number of clusters to create
# df: data frame with burden data
# keyVar: the key variable to use in df
# facetScale: passed to facet_grid()'s scales argument
# Prints two faceted plots (cumulative, incremental) and returns the
# cluster-date-metric summary frame.
# Counts by cluster
ctCluster <- cutree(obj, k=k) %>%
table() %>%
print()
# Make clustering data frame
clData <- cutree(obj, k=k) %>%
clustersToFrame(colNameName="countyFIPS") %>%
joinFrames(df) %>%
group_by(countyFIPS, name) %>%
# First observation per county-metric has no lag, so delta is the level itself
mutate(delta=ifelse(row_number()==1, get(keyVar), get(keyVar)-lag(get(keyVar)))) %>%
ungroup() %>%
group_by(cluster, date, name) %>%
summarize(p05cum=quantile(get(keyVar), 0.05),
mucum=mean(get(keyVar)),
p95cum=quantile(get(keyVar), 0.95),
p05delta=quantile(delta, 0.05),
mudelta=mean(delta),
p95delta=quantile(delta, 0.95),
.groups="drop"
)
# Plot by cluster and metric
p1 <- clData %>%
ggplot(aes(x=date)) +
geom_line(aes(color=cluster, y=mucum), lwd=1) +
geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
facet_grid(c("tcpm7"="Cumulative cases", "tdpm7"="Cumulative deaths")[name]~cluster,
scales=facetScale
) +
scale_color_brewer(palette="Set1") +
labs(x=NULL, y="Cumulative", title="Mean and 90% range by cluster and metric")
print(p1)
# Plot incremental rather than cumulative
p2 <- clData %>%
ggplot(aes(x=date)) +
geom_line(aes(color=cluster, y=mudelta), lwd=1) +
geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
facet_grid(c("tcpm7"="Cases", "tdpm7"="Deaths")[name]~cluster, scales=facetScale) +
scale_color_brewer(palette="Set1") +
labs(x=NULL, y="Burden", title="Mean and 90% range by cluster and metric")
print(p2)
# Return the clustering frame
clData
}
# Summaries for k=2 through 9 on raw values; free y-scales because clusters
# differ greatly in absolute burden
plotListValue100k <- lapply(2:9,
FUN=function(x) plotHierarchical(p2CompleteValue100k,
k=x,
df=filter(p2DataAll, pop>=100000),
keyVar="value",
facetScale="free_y"
)
)
## .
## 1 2
## 586 13
## Joining, by = "countyFIPS"
## .
## 1 2 3
## 319 267 13
## Joining, by = "countyFIPS"
## .
## 1 2 3 4
## 319 267 4 9
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5
## 319 236 4 31 9
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6
## 319 236 4 31 7 2
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6 7
## 319 81 155 4 31 7 2
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6 7 8
## 319 81 155 4 23 8 7 2
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5 6 7 8 9
## 319 81 144 4 23 11 8 7 2
## Joining, by = "countyFIPS"
With 9 segments, the following shapes and burdens (using deaths as the metric) are noted:
Separating counties with very high burden and very low burden appears reasonable. Clustering can be re-run with only the counties showing moderate disease burden, where shape is a more meaningful factor.
Segments are selected using k=8, with the four smallest buckets consolidated:
# Cut at k=8, then merge any cluster with fewer than 20 counties into a
# single catch-all cluster labeled 9
dfClust100k_k8 <- cutree(p2CompleteValue100k, k=8) %>%
vecToTibble(colNameData="tempCluster", colNameName="countyFIPS") %>%
group_by(tempCluster) %>%
mutate(cluster=factor(ifelse(n()>=20, tempCluster, 9))) %>%
ungroup()
dfClust100k_k8
## # A tibble: 599 x 3
## countyFIPS tempCluster cluster
## <chr> <int> <fct>
## 1 01003 1 1
## 2 01015 2 2
## 3 01055 2 2
## 4 01069 2 2
## 5 01073 2 2
## 6 01081 1 1
## 7 01089 1 1
## 8 01097 3 3
## 9 01101 2 2
## 10 01103 2 2
## # ... with 589 more rows
# Counts of the consolidated cluster labels
count(dfClust100k_k8, cluster)
## # A tibble: 5 x 2
## cluster n
## <fct> <int>
## 1 1 319
## 2 2 81
## 3 3 155
## 4 9 21
## 5 5 23
The plotHierarchical function is updated to allow for passing clusters to it directly:
# Updated to allow for variable other than pct and obj that is of class data.frame
plotHierarchical <- function(obj, k, df, keyVar="pct", facetScale="fixed") {
  # FUNCTION ARGUMENTS:
  # obj: clustering object from hierarchical clustering OR df with countyFIPS-cluster
  # k: number of clusters to create (unused when obj is already a data frame)
  # df: data frame with burden data
  # keyVar: the key variable to use in df
  # facetScale: passed to facet_grid()'s scales argument
  # Prints two faceted plots (cumulative, incremental) and returns the
  # cluster-date-metric summary frame.
  # Convert obj to a data frame if obj has been passed as an hclust object
  # (inherits() is the idiomatic class test and handles subclasses correctly)
  if (!inherits(obj, "data.frame")) {
    if (!inherits(obj, "hclust"))
      stop("\nMust pass object that is member of class data.frame or hclust\n")
    # Print counts by cluster
    cutree(obj, k=k) %>%
      table() %>%
      print()
    # Convert obj to relevant data frame
    obj <- cutree(obj, k=k) %>%
      clustersToFrame(colNameName="countyFIPS")
  }
  # Make clustering data frame (obj has been converted to data.frame above if not already passed that way)
  clData <- obj %>%
    joinFrames(df) %>%
    group_by(countyFIPS, name) %>%
    # First observation per county-metric has no lag, so delta is the level itself
    mutate(delta=ifelse(row_number()==1, get(keyVar), get(keyVar)-lag(get(keyVar)))) %>%
    ungroup() %>%
    group_by(cluster, date, name) %>%
    summarize(p05cum=quantile(get(keyVar), 0.05),
              mucum=mean(get(keyVar)),
              p95cum=quantile(get(keyVar), 0.95),
              p05delta=quantile(delta, 0.05),
              mudelta=mean(delta),
              p95delta=quantile(delta, 0.95),
              .groups="drop"
              )
  # Plot by cluster and metric
  p1 <- clData %>%
    ggplot(aes(x=date)) +
    geom_line(aes(color=cluster, y=mucum), lwd=1) +
    geom_ribbon(aes(ymin=p05cum, ymax=p95cum), alpha=0.25) +
    facet_grid(c("tcpm7"="Cumulative cases", "tdpm7"="Cumulative deaths")[name]~cluster,
               scales=facetScale
               ) +
    scale_color_brewer(palette="Set1") +
    labs(x=NULL, y="Cumulative", title="Mean and 90% range by cluster and metric")
  print(p1)
  # Plot incremental rather than cumulative
  p2 <- clData %>%
    ggplot(aes(x=date)) +
    geom_line(aes(color=cluster, y=mudelta), lwd=1) +
    geom_ribbon(aes(ymin=p05delta, ymax=p95delta), alpha=0.25) +
    facet_grid(c("tcpm7"="Cases", "tdpm7"="Deaths")[name]~cluster, scales=facetScale) +
    scale_color_brewer(palette="Set1") +
    labs(x=NULL, y="Burden", title="Mean and 90% range by cluster and metric")
  print(p2)
  # Return the clustering frame
  clData
}
# Summarize/plot using the consolidated k=8 assignments passed in directly
clData_100k_k8 <- plotHierarchical(obj=select(dfClust100k_k8, countyFIPS, cluster),
df=p2DataAll,
keyVar="value",
facetScale="free_y"
)
## Joining, by = "countyFIPS"
Next, the existing algorithm is updated to assign every cluster to the closest cluster mean:
# Squared-Euclidean distance from each county's curve to each cluster mean.
#
# dfClust: cluster-level summary containing the metClust column
# dfAll: county-level data containing the typeUse column
# metType: metric (value of the name column) on which curves are compared
# metClust: summary column in dfClust holding the cluster mean
# typeUse: county-level column to compare against ("pct" or "value")
# Returns one row per cluster-county pair with the summed squared distance,
# sorted so each county's closest cluster appears first.
tempDistCluster <- function(dfClust, dfAll, metType="tdpm7", metClust="mucum", typeUse="pct") {
  clusterMeans <- dfClust %>%
    filter(name==metType) %>%
    colSelector(vecSelect=c("cluster", "date", metClust)) %>%
    colRenamer(vecRename=c("keyMetric") %>% purrr::set_names(metClust))
  countyCurves <- dfAll %>%
    filter(name==metType) %>%
    colSelector(vecSelect=c("countyFIPS", "date", typeUse)) %>%
    colRenamer(vecRename=c("keyMetric") %>% purrr::set_names(typeUse))
  # Cross clusters with counties via the shared date axis; missing values
  # on either side count as zero before squaring the difference
  clusterMeans %>%
    left_join(countyCurves, by="date") %>%
    mutate(clusterVal=ifelse(is.na(keyMetric.x), 0, keyMetric.x),
           countyVal=ifelse(is.na(keyMetric.y), 0, keyMetric.y),
           delta2=(clusterVal-countyVal)^2
           ) %>%
    group_by(cluster, countyFIPS) %>%
    summarize(dist=sum(delta2), .groups="drop") %>%
    arrange(countyFIPS, dist)
}
# Calculate distances to cluster mean, then print
# (typeUse="value" compares raw burden rather than pct-of-final)
countyClusterDist_new <- tempDistCluster(dfClust=clData_100k_k8, dfAll=p2DataAll, typeUse="value")
countyClusterDist_new
## # A tibble: 15,710 x 3
## cluster countyFIPS dist
## <fct> <chr> <dbl>
## 1 3 01001 9569283.
## 2 2 01001 67090707.
## 3 1 01001 88240896.
## 4 5 01001 353564016.
## 5 9 01001 983953954.
## 6 1 01003 14296132.
## 7 3 01003 59333163.
## 8 2 01003 191161474.
## 9 5 01003 563092256.
## 10 9 01003 1367670103.
## # ... with 15,700 more rows
# Assess overlap of cluster for counties already clustered
tmpDistDF_new <- dfClust100k_k8 %>%
  select(countyFIPS, hierCluster=cluster) %>%
  left_join(countyClusterDist_new, by="countyFIPS") %>%
  arrange(countyFIPS, dist)
# Keep each county's closest cluster mean, then cross-tabulate against the
# assigned hierarchical cluster as a shaded tile grid.
# Axis labels updated: these clusters come from the k=8 cut with small
# buckets merged, not the earlier k=5 solution.
tmpDistDF_new %>%
  group_by(countyFIPS) %>%
  filter(row_number()==1) %>%
  ungroup() %>%
  mutate(cluster=factor(as.integer(as.character(cluster)))) %>%
  count(hierCluster, cluster) %>%
  group_by(hierCluster) %>%
  mutate(pct=n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=cluster, y=hierCluster)) +
  geom_tile(aes(fill=pct)) +
  geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) +
  scale_fill_continuous(low="white", high="lightgreen") +
  labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean",
       x="Closest hierarchical cluster mean (k=8, small buckets merged)",
       y="Assigned hierarchical cluster (k=8, small buckets merged)"
       )
While there are some minor differences, overwhelmingly counties are already aligned to the closest cluster mean. The approach is then extended to counties with populations under 100k:
# Full clustering database
# Each county mapped to its closest cluster mean; hierCluster holds the
# assignment from the 100k+ hierarchical run (NA otherwise)
distOnlyClust_new <- countyClusterDist_new %>%
arrange(countyFIPS, dist) %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
left_join(select(dfClust100k_k8, countyFIPS, hierCluster=cluster), by="countyFIPS")
# Cluster assignments depending on size of county
distOnlyClust_new %>%
ggplot(aes(x=cluster)) +
geom_bar(fill="lightblue") +
geom_text(stat="count", aes(y=..count../2, label=..count..)) +
facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"),
scales="free_y"
) +
labs(title="Closest hierarchical cluster")
# Plot all counties based on closest cluster
# (levels sequenced so lighter viridis colors indicate higher/earlier burden)
sparseCountyClusterMap(rename(distOnlyClust_new, fips=countyFIPS) %>%
mutate(cluster=factor(cluster, levels=c(1, 3, 2, 9, 5))),
brewPalette="viridis",
caption="Closest hierarchical cluster mean (euclidean distance of tdpm7)"
) +
labs(title="Lighter clusters generally have higher/earlier burden")
There continue to be meaningful geographic trends, with counties frequently adjacent to other counties in the same segment. Smaller counties are over-represented in groups with relatively high burden (9 and 2).
Next steps are to try to bring the shape dimension slightly more to the surface (in particular for segments 2 and 3 which have a fat 90% range early in the pandemic).
Data for segments 2 and 3 are extracted, then clustering is run solely for these groups:
# Create data containing only clusters 2 and 3
# (full burden history for counties assigned to those hierarchical segments)
p2Data_Seg_23 <- distOnlyClust_new %>%
filter(hierCluster %in% c(2, 3)) %>%
select(countyFIPS, hierCluster) %>%
left_join(p2DataAll, by="countyFIPS")
p2Data_Seg_23
## # A tibble: 234,112 x 8
## countyFIPS hierCluster pop state date name value pct
## <chr> <fct> <dbl> <chr> <date> <chr> <dbl> <dbl>
## 1 01015 2 113605 AL 2020-01-25 tdpm7 0 0
## 2 01015 2 113605 AL 2020-01-25 tcpm7 0 0
## 3 01015 2 113605 AL 2020-01-26 tdpm7 0 0
## 4 01015 2 113605 AL 2020-01-26 tcpm7 0 0
## 5 01015 2 113605 AL 2020-01-27 tdpm7 0 0
## 6 01015 2 113605 AL 2020-01-27 tcpm7 0 0
## 7 01015 2 113605 AL 2020-01-28 tdpm7 0 0
## 8 01015 2 113605 AL 2020-01-28 tcpm7 0 0
## 9 01015 2 113605 AL 2020-01-29 tdpm7 0 0
## 10 01015 2 113605 AL 2020-01-29 tcpm7 0 0
## # ... with 234,102 more rows
# Create distance matrix and hierarchical clusters
# (same wide county-by-date construction as before, restricted to the
# counties pulled from segments 2 and 3)
p2Dist_Seg_23 <- p2Data_Seg_23 %>%
filter(name=="tdpm7") %>%
select(countyFIPS, date, value) %>%
pivotData(pivotKeys="countyFIPS", toLonger = FALSE, nameVar="date", valVar="value") %>%
column_to_rownames("countyFIPS") %>%
as.matrix() %>%
dist()
# Run clustering with method "complete"
p2Complete_Seg_23 <- hclust(p2Dist_Seg_23, method="complete")
plot(p2Complete_Seg_23)
# Plot cluster summaries
plotList_Seg_23 <- lapply(2:5,
FUN=function(x) plotHierarchical(p2Complete_Seg_23,
k=x,
df=filter(p2Data_Seg_23, pop>=100000),
keyVar="value",
facetScale="free_y"
)
)
## .
## 1 2
## 81 155
## Joining, by = "countyFIPS"
## .
## 1 2 3
## 81 144 11
## Joining, by = "countyFIPS"
## .
## 1 2 3 4
## 48 144 33 11
## Joining, by = "countyFIPS"
## .
## 1 2 3 4 5
## 46 2 144 33 11
## Joining, by = "countyFIPS"
# Create the four cluster summary
dfClust_Seg_23_k4 <- cutree(p2Complete_Seg_23, k=4) %>%
vecToTibble(colNameData="tempCluster", colNameName="countyFIPS") %>%
mutate(cluster=factor(tempCluster))
# Comparison to original hierarchical cluster
# (shows how each of segments 2 and 3 was split by the re-clustering)
dfClust_Seg_23_k4 %>%
left_join(select(distOnlyClust_new, countyFIPS, hierCluster), by="countyFIPS") %>%
count(hierCluster, cluster)
## # A tibble: 4 x 3
## hierCluster cluster n
## <fct> <fct> <int>
## 1 2 1 48
## 2 2 3 33
## 3 3 2 144
## 4 3 4 11
# Map of the new clusters
dfClust_Seg_23_k4 %>%
select(fips=countyFIPS, cluster) %>%
usmap::plot_usmap(regions="counties", data=., values="cluster") +
scale_fill_brewer("Cluster", palette="Set1")
# Plot the four cluster summary
clData_Seg_23_k4 <- plotHierarchical(obj=select(dfClust_Seg_23_k4, countyFIPS, cluster),
df=p2Data_Seg_23,
keyVar="value",
facetScale="free_y"
)
## Joining, by = "countyFIPS"
As expected, each of the original segments is split, helping to bring out a mix of shape and total burden:
While small, there is meaningful value in the ~2000 deaths per million cluster with early disease. It is focused on major metros (Chicago, Detroit, Philadelphia, New Orleans, NYC collar) that had early spikes similar to core NYC counties but without the associated high levels of total burden. These are distinct from the much larger group of counties that got to ~2000 deaths per million with a much later timing.
Next steps are to create a full clustering database and extend to counties with under 100k population.
First, the segments for counties with 100k+ are combined:
# Combine the original (k=8 consolidated) and sub-divided segment labels:
# counties re-clustered in the 2/3 split get finalCluster = 10*orig + new
# (e.g. orig 2 and new 1 -> 21); all others keep their original number
dfClust100k_full <- dfClust100k_k8 %>%
select(countyFIPS, origCluster=cluster) %>%
full_join(select(dfClust_Seg_23_k4, countyFIPS, newCluster=cluster), by="countyFIPS") %>%
mutate(finalCluster=ifelse(is.na(newCluster),
as.integer(as.character(origCluster)),
10*as.integer(as.character(origCluster)) + as.integer(as.character(newCluster))
),
finalCluster=factor(finalCluster, levels=sort(unique(finalCluster)))
)
dfClust100k_full
## # A tibble: 599 x 4
## countyFIPS origCluster newCluster finalCluster
## <chr> <fct> <fct> <fct>
## 1 01003 1 <NA> 1
## 2 01015 2 1 21
## 3 01055 2 1 21
## 4 01069 2 1 21
## 5 01073 2 1 21
## 6 01081 1 <NA> 1
## 7 01089 1 <NA> 1
## 8 01097 3 2 32
## 9 01101 2 3 23
## 10 01103 2 1 21
## # ... with 589 more rows
# Cross-tab of original, new, and combined segment labels
dfClust100k_full %>%
count(origCluster, newCluster, finalCluster)
## # A tibble: 7 x 4
## origCluster newCluster finalCluster n
## <fct> <fct> <fct> <int>
## 1 1 <NA> 1 319
## 2 2 1 21 48
## 3 2 3 23 33
## 4 3 2 32 144
## 5 3 4 34 11
## 6 9 <NA> 9 21
## 7 5 <NA> 5 23
# Plot the seven cluster summary
# (finalCluster renamed to cluster, the column plotHierarchical expects)
clData_Seg_100k_full <- plotHierarchical(obj=select(dfClust100k_full, countyFIPS, cluster=finalCluster),
df=p2DataAll,
keyVar="value",
facetScale="free_y"
)
## Joining, by = "countyFIPS"
# Map of the new clusters
# (7 final segments across the counties with 100k+ population)
dfClust100k_full %>%
select(fips=countyFIPS, cluster=finalCluster) %>%
usmap::plot_usmap(regions="counties", data=., values="cluster") +
scale_fill_brewer("Cluster", palette="Set1")
Smaller counties are then mapped to the nearest segment average:
# Calculate distances to cluster mean, then print
# NOTE(review): scores against p2All (the earlier pop>0 frame) rather than
# p2DataAll -- appears intentional so all counties are scored; confirm
countyClusterDist_100k_full <- tempDistCluster(dfClust=clData_Seg_100k_full, dfAll=p2All, typeUse="value")
countyClusterDist_100k_full
## # A tibble: 21,994 x 3
## cluster countyFIPS dist
## <fct> <chr> <dbl>
## 1 32 01001 6770143.
## 2 21 01001 46397990.
## 3 1 01001 88240896.
## 4 34 01001 123376092.
## 5 23 01001 125257271.
## 6 5 01001 353564016.
## 7 9 01001 983953954.
## 8 1 01003 14296132.
## 9 32 01003 51092603.
## 10 21 01003 143224481.
## # ... with 21,984 more rows
# Assess overlap of cluster for counties already clustered
tmpDistDF_100k_full <- dfClust100k_full %>%
  select(countyFIPS, finalCluster) %>%
  left_join(countyClusterDist_100k_full, by="countyFIPS") %>%
  arrange(countyFIPS, dist)
# Keep each county's closest cluster mean, then cross-tabulate against the
# assigned final cluster as a shaded tile grid.
# Axis labels updated: these are the 7 final segments, not the earlier k=5 cut.
tmpDistDF_100k_full %>%
  group_by(countyFIPS) %>%
  filter(row_number()==1) %>%
  ungroup() %>%
  mutate(cluster=factor(as.integer(as.character(cluster)))) %>%
  count(hierCluster=finalCluster, cluster) %>%
  group_by(hierCluster) %>%
  mutate(pct=n/sum(n)) %>%
  ungroup() %>%
  ggplot(aes(x=cluster, y=hierCluster)) +
  geom_tile(aes(fill=pct)) +
  geom_text(aes(label=paste0("n=", n, "\n(", round(100*pct), "%)"))) +
  scale_fill_continuous(low="white", high="lightgreen") +
  labs(title="Overlap of assigned hierarchical cluster and closest hierarchical cluster mean",
       x="Closest hierarchical cluster mean (7 final segments)",
       y="Assigned hierarchical cluster (7 final segments)"
       )
# Full clustering database
# Each county mapped to the closest of the 7 final cluster means; hierCluster
# holds the original assignment (NA for counties outside the 100k+ run)
distOnlyClust_100k_full <- countyClusterDist_100k_full %>%
arrange(countyFIPS, dist) %>%
group_by(countyFIPS) %>%
filter(row_number()==1) %>%
ungroup() %>%
left_join(select(dfClust100k_full, countyFIPS, hierCluster=finalCluster), by="countyFIPS")
# Cluster assignments depending on size of county
distOnlyClust_100k_full %>%
ggplot(aes(x=cluster)) +
geom_bar(fill="lightblue") +
geom_text(stat="count", aes(y=..count../2, label=..count..)) +
facet_wrap(~ifelse(is.na(hierCluster), "Not in hierarchical data", "In hierarchical data"),
scales="free_y"
) +
labs(title="Closest hierarchical cluster")
# Plot all counties based on closest cluster
# (levels sequenced so lighter viridis colors indicate higher/earlier burden)
sparseCountyClusterMap(rename(distOnlyClust_100k_full, fips=countyFIPS) %>%
mutate(cluster=factor(cluster, levels=c(1, 32, 34, 21, 23, 5, 9))),
brewPalette="viridis",
caption="Closest hierarchical cluster mean (euclidean distance of % cumulative death)"
) +
labs(title="Lighter clusters have higher burden and/or higher percentage of burden early")
There continue to be meaningful patterns by geography. Next steps are to download the latest data and run the full segment diagnostics.
New data are downloaded, with the above segments applied:
# Input files (downloaded 2021-06-22) and the prior run's raw data for comparison
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210622.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210622.csv"
)
compareList <- list("usafCase"=cty_newdata_20210608$dfRaw$usafCase,
"usafDeath"=cty_newdata_20210608$dfRaw$usafDeath
)
# Apply the distance-based segments; counties under 25k population are
# pooled into the catch-all cluster "999"
dfClust <- distOnlyClust_100k_full %>%
select(countyFIPS, cluster) %>%
mutate(cluster=as.character(cluster)) %>%
left_join(getCountyData(), by="countyFIPS") %>%
mutate(origCluster=cluster, cluster=ifelse(pop>=25000, cluster, "999"))
# Named character vector of cluster keyed by countyFIPS, passed as useClusters
useClusters <- as.character(dfClust$cluster) %>%
purrr::set_names(dfClust$countyFIPS)
# Create new clusters
# (downloadTo is NA per file when it already exists locally, skipping re-download)
cty_newdata_20210622 <- readRunUSAFacts(maxDate="2021-06-20",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log",
ovrwriteLog=TRUE,
useClusters=useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 14
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 14
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210622_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 6.66e+9 3.29e+7 1647588
## 2 after 6.63e+9 3.28e+7 1621272
## 3 pctchg 4.52e-3 3.78e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.34e+8 595239 1647588
## 2 after 1.34e+8 593224 1621272
## 3 pctchg 3.70e-3 0.00339 0.0160
## NULL
# Plot all counties based on closest cluster
sparseCountyClusterMap(cty_newdata_20210622$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
# Persist the run; ovrWriteError=FALSE presumably suppresses the error when
# the RDS already exists -- TODO confirm against saveToRDS() source
saveToRDS(cty_newdata_20210622, ovrWriteError=FALSE)
The updated county clustering routines appear to work as intended. Next steps are to update the CDC excess deaths process.
The latest data are downloaded, with existing clusters applied:
# Paths to the 2021-08-13 USA Facts downloads (cases and deaths by county).
readList <- list("usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210813.csv",
"usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210813.csv"
)
# Prior-run raw data used as the comparison baseline, so the similarity and
# difference checks below report what changed relative to the 2021-06-22
# extract.
compareList <- list("usafCase"=readFromRDS("cty_newdata_20210622")$dfRaw$usafCase,
"usafDeath"=readFromRDS("cty_newdata_20210622")$dfRaw$usafDeath
)
# Read and process the 2021-08-13 data, applying the clusters already built in
# the 2021-06-22 run (useClusters=...) rather than re-clustering from scratch.
# downloadTo is set to NA per file when the CSV already exists locally, which
# makes readRunUSAFacts() skip the download and read the existing file (see
# the "No file has been downloaded" messages in the output below).
cty_newdata_20210813 <- readRunUSAFacts(maxDate="2021-08-11",
downloadTo=lapply(readList,
FUN=function(x) if(file.exists(x)) NA else x
),
readFrom=readList,
compareFile=compareList,
writeLog="./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log",
ovrwriteLog=TRUE,
useClusters=readFromRDS("cty_newdata_20210622")$useClusters,
skipAssessmentPlots=FALSE,
brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210813.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 51
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210813.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character(),
## StateFIPS = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 51
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_NewData_20210813_v005.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 8.38e+9 3.54e+7 1810431
## 2 after 8.34e+9 3.52e+7 1781514
## 3 pctchg 4.49e-3 5.53e-3 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.65e+8 611955 1810431
## 2 after 1.64e+8 607482 1781514
## 3 pctchg 3.93e-3 0.00731 0.0160
## NULL
# Plot all counties based on closest cluster (25k+ population only, per the
# caption); clusters are the ones carried over from the 2021-06-22 run.
# NOTE(review): "viridis" is not an RColorBrewer palette name despite the
# brewPalette argument name -- confirm the function handles viridis scales.
sparseCountyClusterMap(cty_newdata_20210813$useClusters,
caption="Includes only counties with 25k+ population",
brewPalette="viridis"
)
# Rank dates by how many counties posted a negative new_cases revision.
# nNeg/nPos count counties with negative/positive daily values; new_cases is
# the national daily total (negatives net against positives).
cty_newdata_20210813$dfPerCapita %>%
  group_by(date) %>%
  summarize(nNeg = sum(new_cases < 0),
            nPos = sum(new_cases > 0),
            new_cases = sum(new_cases)
            ) %>%
  arrange(desc(nNeg))
## # A tibble: 567 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-09 310 1797 -399248
## 2 2020-12-28 266 2372 216179
## 3 2020-12-25 262 1350 49223
## 4 2020-10-24 256 2230 53241
## 5 2021-06-11 205 1706 -9167
## 6 2021-06-23 191 1629 15003
## 7 2021-06-18 184 1442 20155
## 8 2021-06-25 183 1545 24105
## 9 2021-04-16 174 2237 65229
## 10 2021-06-30 174 1610 10814
## # ... with 557 more rows
# Inspect the restatement window surrounding the large July 9, 2021 negative
# anomaly (July 4-14, 2021). Same daily summary as above, filtered to range.
cty_newdata_20210813$dfPerCapita %>%
  group_by(date) %>%
  summarize(nNeg = sum(new_cases < 0),
            nPos = sum(new_cases > 0),
            new_cases = sum(new_cases)
            ) %>%
  filter(date >= "2021-07-04" & date <= "2021-07-14")
## # A tibble: 11 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2021-07-04 8 333 2737
## 2 2021-07-05 12 395 4228
## 3 2021-07-06 57 1617 27035
## 4 2021-07-07 97 1756 21302
## 5 2021-07-08 129 1613 19276
## 6 2021-07-09 310 1797 -399248
## 7 2021-07-10 17 653 452834
## 8 2021-07-11 8 362 3435
## 9 2021-07-12 59 1596 37864
## 10 2021-07-13 89 1588 21977
## 11 2021-07-14 97 1984 32297
# Restatements taking place between December 21, 2020 and January 6, 2021.
# The negative spikes on Dec 25 and Dec 28 (see output) line up with holiday
# reporting pauses and subsequent corrections.
cty_newdata_20210813$dfPerCapita %>%
group_by(date) %>%
# nNeg/nPos count counties with negative/positive daily revisions; new_cases
# is the national daily total (negatives net against positives).
summarize(nNeg=sum(new_cases < 0), nPos=sum(new_cases > 0), new_cases=sum(new_cases)) %>%
# Comparing a Date column against character strings relies on R coercing the
# strings to Date for the comparison.
filter(date >= "2020-12-21", date <= "2021-01-06")
## # A tibble: 17 x 4
## date nNeg nPos new_cases
## <date> <int> <int> <dbl>
## 1 2020-12-21 8 2659 362667
## 2 2020-12-22 2 2514 174503
## 3 2020-12-23 0 2567 227200
## 4 2020-12-24 3 2345 185122
## 5 2020-12-25 262 1350 49223
## 6 2020-12-26 33 1964 170262
## 7 2020-12-27 5 2018 121643
## 8 2020-12-28 266 2372 216179
## 9 2020-12-29 5 2510 229421
## 10 2020-12-30 20 2698 238917
## 11 2020-12-31 32 2406 222938
## 12 2021-01-01 4 1603 190655
## 13 2021-01-02 0 2346 283820
## 14 2021-01-03 8 2470 229336
## 15 2021-01-04 4 2484 169859
## 16 2021-01-05 0 2773 238758
## 17 2021-01-06 0 2885 256350
# Persist the full August 13 results object for later reuse via readFromRDS();
# ovrWriteError=FALSE presumably tolerates an existing RDS -- TODO confirm.
saveToRDS(cty_newdata_20210813, ovrWriteError=FALSE)
There is a very large restatement of new_cases data occurring on or around July 8-11, 2021. Smoothing could be considered in which each daily value in the July 8-11, 2021 range is replaced by the average across those four days. There is a similar though smaller anomaly in the December 24-29, 2020 range.
Dates are examined for those with the largest number of negative revisions in new_cases and/or new_deaths:
# All restatements: per day and metric, count and total the negative
# revisions. The four metrics are raw daily counts (new_cases, new_deaths)
# and 7-day per-million rolling means (cpm7, dpm7); pivot_longer gives one
# row per date-metric before aggregating.
plotData <- cty_newdata_20210813$dfPerCapita %>%
select(date, new_cases, new_deaths, cpm7, dpm7) %>%
pivot_longer(-date) %>%
filter(!is.na(value)) %>%
group_by(date, name) %>%
# NOTE(review): nNeg counts values <= -1 while sumNeg sums anything strictly
# below 0 -- for the per-million metrics the <= -1 cutoff excludes small
# fractional negatives; confirm the asymmetric thresholds are intentional.
summarize(nNeg=sum(value <= -1), sumNeg=sum(ifelse(value < 0, value, 0)), .groups="drop")
# Display labels for the facet strips, keyed by the metric column names.
tempMapper <- c("cpm7"="Cases per million (7-day mean)",
"dpm7"="Deaths per million (7-day mean)",
"new_cases"="New cases",
"new_deaths"="New deaths"
)
# One facet per metric: number of counties at or below -1 on each day.
plotData %>%
ggplot(aes(x=date, y=nNeg)) +
geom_line() +
facet_wrap(~tempMapper[name]) +
labs(x=NULL,
y="Number of records",
title="Counties reporting less than or equal to -1 for metric on day"
)
# One facet per metric: summed negative value per day; free y scales because
# the metrics differ by orders of magnitude.
plotData %>%
ggplot(aes(x=date, y=sumNeg)) +
geom_line() +
facet_wrap(~tempMapper[name], scales="free_y") +
labs(x=NULL,
y="Sum of value for counties recording negative",
title="Counties reporting less than 0 for metric on day"
)
For records with negative values, the rolling 7-day mean is calculated around the day of the negative value:
# All restatements, now qualified by a rolling window: a negative day is only
# flagged when the surrounding 7-day mean is also negative (or unavailable),
# separating persistent restatements from single-day blips that wash out.
plotData <- cty_newdata_20210813$dfPerCapita %>%
select(date, countyFIPS, new_cases, new_deaths, cpm7, dpm7) %>%
pivot_longer(-c(date, countyFIPS)) %>%
filter(!is.na(value)) %>%
group_by(countyFIPS, name) %>%
# arrange() on a grouped data frame sorts the whole frame by date (groups are
# ignored by default), which still leaves each county-metric series in date
# order for the rolling mean below.
arrange(date) %>%
# zoo::rollmean with k=7 uses a centered window by default; fill=NA leaves
# the first and last three dates of each series as NA. isNeg7 deliberately
# treats those NA edges as "negative" (conservative inclusion).
mutate(value7=zoo::rollmean(value, k=7, fill=NA),
isNeg=(value <= -1),
isNeg7=(is.na(value7) | value7 < 0)
) %>%
# Re-aggregate to date-metric, keeping only the doubly-negative records.
group_by(date, name) %>%
summarize(nNeg=sum(isNeg & isNeg7), sumNeg=sum(ifelse(isNeg & isNeg7, value, 0)), .groups="drop")
# Display labels for the facet strips, keyed by the metric column names.
tempMapper <- c("cpm7"="Cases per million (7-day mean)",
"dpm7"="Deaths per million (7-day mean)",
"new_cases"="New cases",
"new_deaths"="New deaths"
)
# Number of doubly-negative county records per day, faceted by metric.
plotData %>%
ggplot(aes(x=date, y=nNeg)) +
geom_line() +
facet_wrap(~tempMapper[name]) +
labs(x=NULL,
y="Number of records",
title="Counties reporting negative value that are also negative over the 7-day window"
)
# Summed value of the doubly-negative records per day (free y scales).
plotData %>%
ggplot(aes(x=date, y=sumNeg)) +
geom_line() +
facet_wrap(~tempMapper[name], scales="free_y") +
labs(x=NULL,
y="Sum of value",
title="Counties reporting less than 0 for metric on day that are also negative over the 7-day window"
)
# Cross-tabulate isNeg x isNeg7 per metric to size how much of the negative
# mass the rolling-window qualification would retain versus drop. The
# summarize here intentionally keeps the default .groups behavior, hence the
# `summarise()` message in the output below.
cty_newdata_20210813$dfPerCapita %>%
select(date, countyFIPS, new_cases, new_deaths, cpm7, dpm7) %>%
pivot_longer(-c(date, countyFIPS)) %>%
filter(!is.na(value)) %>%
group_by(countyFIPS, name) %>%
arrange(date) %>%
mutate(value7=zoo::rollmean(value, k=7, fill=NA),
isNeg=(value <= -1),
isNeg7=(is.na(value7) | value7 < 0)
) %>%
ungroup() %>%
group_by(name, isNeg, isNeg7) %>%
summarize(n=n(), sumValue=sum(value))
## `summarise()` has grouped output by 'name', 'isNeg'. You can override using the `.groups` argument.
## # A tibble: 16 x 5
## # Groups: name, isNeg [8]
## name isNeg isNeg7 n sumValue
## <chr> <lgl> <lgl> <int> <dbl>
## 1 cpm7 FALSE FALSE 1701869 336714571.
## 2 cpm7 FALSE TRUE 45946 3248568.
## 3 cpm7 TRUE FALSE 3859 -265077.
## 4 cpm7 TRUE TRUE 10988 -2790977.
## 5 dpm7 FALSE FALSE 1627092 6667360.
## 6 dpm7 FALSE TRUE 126669 20031.
## 7 dpm7 TRUE FALSE 678 -4144.
## 8 dpm7 TRUE TRUE 8223 -102243.
## 9 new_cases FALSE FALSE 1732888 35567937
## 10 new_cases FALSE TRUE 30335 362101
## 11 new_cases TRUE FALSE 14909 -600467
## 12 new_cases TRUE TRUE 3382 -114828
## 13 new_deaths FALSE FALSE 1749115 616693
## 14 new_deaths FALSE TRUE 27664 2807
## 15 new_deaths TRUE FALSE 3087 -7199
## 16 new_deaths TRUE TRUE 1648 -4819
Making an adjustment for situations where a metric is negative but the 7-day total is non-negative can help to smooth some restatement anomalies. There will still be restatements in the data, and these seem to increase in the more recent months of the data.
Next steps are to build a function to manage the smoothing, with a goal to incorporate this as part of per-capita processing.